1 Info

Here I train the 16x video classifiers with the merged data, i.e. using stock culture data from every time point and light setting recorded.

2 Dependencies

library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)

set.seed(1)

3 Data

3.1 Normal light

3.1.1 1st data (2020-2021)

colnames <- c("Particle_ID","Area_ABD","Area_Filled","Aspect_Ratio","Average_Blue",
              "Average_Green","Average_Red","Calibration_Factor","Calibration_Image",
              "Camera","Capture_X","Capture_Y","Ch1_Area","Ch1_Peak","Ch1_Width",
              "Ch2_Area","Ch2_Peak","Ch2_Width","Ch2_Ch1_Ratio","Circle_Fit",
              "Circularity","Circularity_Hu","Compactness","Convex_Perimeter",
              "Convexity","Date","Diameter_ABD","Diameter_ESD","Edge_Gradient",
              "Elongation","Feret_Angle_Max","Feret_Angle_Min","Fiber_Curl",
              "Fiber_Straightness","Filter_Score","Geodesic_Aspect_Ratio",
              "Geodesic_Length","Geodesic_Thickness","Image_File","Image_Height",
              "Image_Width","Image_X","Image_Y","Intensity","Length",
              "Particles_Per_Chain","Perimeter","Ratio_Blue_Green","Ratio_Red_Blue",
              "Ratio_Red_Green","Roughness","Scatter_Area","Scatter_Peak","Scatter_Width",
              "Sigma_Intensity","Source_Image","Sum_Intensity","Symmetry","Time",
              "Timestamp","Transparency","Volume_ABD","Volume_ESD","Width","species")

dd_all <- read_csv("../Class_18C_normalLight/1st_class_2020/Data/libraries_FC100/dd_all.csv", show_col_types = FALSE)
# dd_all$species <- ifelse(dd_all$species=="small_unidentified_possibly_small_digested_algae", "Small_unidentified",dd_all$species)

# table(dd_all$species)

3.1.2 2nd data (2022 Feb)

Chlamydomonas <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Chlamydomonas_light_February_2022_library.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Cosmarium_light_February_2022_library.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Desmodesmus <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Desmodesmus_light_February_2022_library.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Monoraphidium_light_February_2022_library.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Staurastrum1_light_February_2022_library.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Staurastrum2_light_February_2022_library.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_feb22 <- rbind(Chlamydomonas,Cosmarium,Desmodesmus,
                      Monoraphidium,Staurastrum1,Staurastrum2) 

colnames(dd_all_feb22) <- colnames
  
# table(dd_all_feb22$species)

3.1.3 3rd data (20220316)

Chlamydomonas <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Chlamydomonas_light_March_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Cosmarium_light_March_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Cryptomonas <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Cryptomonas_light_March_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

Desmodesmus <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Desmodesmus_light_March_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Monoraphidium_light_March_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Staurastrum1_light_March_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Staurastrum2_light_March_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_20220316 <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
                      Monoraphidium,Staurastrum1,Staurastrum2) 

colnames(dd_all_20220316) <- colnames
  
# table(dd_all_20220316$species)

3.1.4 4th data (20220406)

Chlamydomonas <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Chlamydomonas_light_April_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Cosmarium_light_April_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Cryptomonas <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Cryptomonas_light_April_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

Desmodesmus <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Desmodesmus_light_April_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Monoraphidium_light_April_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Staurastrum1_light_April_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Staurastrum2_light_April_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_20220406 <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
                      Monoraphidium,Staurastrum1,Staurastrum2) 

colnames(dd_all_20220406) <- colnames
  
# table(dd_all_20220406$species)

3.2 Decreasing light

3.2.1 2nd data (2022 Feb) - 18 percent

Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Chlamydomonas_dark_February_2022_library.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Cosmarium_dark_February_2022_library.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Desmodesmus_dark_February_2022_library.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Monoraphidium_dark_February_2022_library.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Staurastrum1_dark_February_2022_library.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Staurastrum2_dark_February_2022_library.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Cryptomonas_dark_February_2022_library.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

dd_all_feb22_dark <- rbind(Chlamydomonas,Cosmarium,Desmodesmus,
                           Monoraphidium,Staurastrum1,Staurastrum2,
                           Cryptomonas) 

colnames(dd_all_feb22_dark) <- colnames
  
# table(dd_all_feb22_dark$species)

3.2.2 3rd data (20220316)

Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Chlamydomonas_dark_March_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Cosmarium_dark_March_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Cryptomonas_dark_March_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Desmodesmus_dark_March_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Monoraphidium_dark_March_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Staurastrum1_dark_March_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Staurastrum2_dark_March_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_20220316_dark <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
                              Monoraphidium,Staurastrum1,Staurastrum2) 

colnames(dd_all_20220316_dark) <- colnames
  
# table(dd_all_20220316_dark$species)

rm(Chlamydomonas, Cosmarium, Cryptomonas, Desmodesmus, Monoraphidium, Staurastrum1, Staurastrum2)

3.2.3 4th data (20220406)

Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Chlamydomonas_dark_April_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Cosmarium_dark_April_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Cryptomonas_dark_April_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Desmodesmus_dark_April_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Monoraphidium_dark_April_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Staurastrum1_dark_April_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Staurastrum2_dark_April_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

dd_all_20220406_dark <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
                              Monoraphidium,Staurastrum1,Staurastrum2) 

colnames(dd_all_20220406_dark) <- colnames
  
# table(dd_all_20220406_dark$species)

rm(Chlamydomonas, Cosmarium, Cryptomonas, Desmodesmus, Monoraphidium, Staurastrum1, Staurastrum2)

4 Further data Processing

4.1 Merge data

dd_all$data <- "Late 2020"
dd_all_feb22$data <- "20220201"
dd_all_20220316$data <- "20220316"
dd_all_20220406$data <- "20220406"
dd <- rbind(dd_all,dd_all_feb22,dd_all_20220316,dd_all_20220406)

dd_all_feb22_dark$data <- "20220201"
dd_all_20220316_dark$data <- "20220316"
dd_all_20220406_dark$data <- "20220406"

dd$light <- "30%"
dd_all_feb22_dark$light <- "18%"
dd_all_20220316_dark$light <- "6%"
dd_all_20220406_dark$light <- "1%"
dd <- rbind(dd,dd_all_feb22_dark,dd_all_20220316_dark,dd_all_20220406_dark)
# table(dd$species, dd$data, dd$light)

4.2 filtering out outliers

If an individual is an outlier in more than 4 traits it gets excluded (about 6% gets excluded). Outlier based on boxplot definition.

dd$id <- 1:nrow(dd)
dd_long <- dd %>%
  dplyr::select(species, Area_ABD,Area_Filled,Aspect_Ratio,Average_Blue,
                Average_Green,Average_Red,Circle_Fit,Circularity,
                Compactness,Convexity,Diameter_ABD,Diameter_ESD,Edge_Gradient,
                Elongation,Feret_Angle_Max,Feret_Angle_Min,Fiber_Curl,
                Fiber_Straightness,Geodesic_Aspect_Ratio,Intensity,
                Length,Perimeter,Ratio_Blue_Green,Ratio_Red_Blue,
                Ratio_Red_Green,Roughness,Sigma_Intensity,Sum_Intensity,
                Symmetry,Transparency,Width, id, data, light) %>%
  pivot_longer(cols=-c(id,species, data, light), names_to="variable") %>%
  dplyr::group_by(variable,species,data,light) %>%
  mutate(iqr = IQR(value),
         first_quartile = quantile(value, probs = 0.25),
         third_quartile = quantile(value, probs = 0.75),
         outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))

outliers <- dd_long %>%
  dplyr::filter(outlier==T) %>%
  dplyr::group_by(species, id,data,light) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::filter(n>4)
## `summarise()` has grouped output by 'species', 'id', 'data'. You can override using the `.groups` argument.
# table(outliers$species)
# nrow(outliers)/nrow(dd)

dd_filtered <- dd %>%
  dplyr::filter(!is.element(id,outliers$id))

dd$removed_outliers <- F
dd_filtered$removed_outliers <- T

dd_comparison <- rbind(dd,dd_filtered)

# dd_comparison %>%
#   ggplot(aes(log10(Area_ABD), col=removed_outliers))+
#   geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
#   geom_boxplot(outlier.alpha = 0.3) +
#   theme_bw() +
#   facet_wrap(species~interaction(light,data), scales = "free_y")  +
#   geom_vline(xintercept = 1, col="black")
# 
# dd_filtered %>%
#   ggplot(aes(log10(Area_ABD)))+
#   geom_density(aes(col=species))

4.3 Sub-sampling to create training data

We have data from several time points and light settings. The goals is that in the final training data these are somewhat equally represented, regardless that the number of individuals tracked per species and time point can vary a lot (in other words: “even it out” across species, time point and light setting). This is important, as otherwise it can be that groups that are more abundant are weighted more.

dd <- dd %>%
  mutate(data.light = interaction(data,light, drop = T),
         data.light.species=interaction(data,light,species, drop = T))

print("number of individuals per species per date and light setting")
## [1] "number of individuals per species per date and light setting"
tab <- table(dd$data.light, dd$species)
tab
##                
##                 airbubbles Chlamydomonas ChlamydomonasClumps Coleps_irchel
##   20220406.1%            0          2035                   0             0
##   20220201.18%           0          3023                   0             0
##   20220201.30%           0          6004                   0             0
##   20220316.30%           0          6463                   0             0
##   20220406.30%           0          4391                   0             0
##   Late 2020.30%        112          1099                 359           475
##   20220316.6%            0          4410                   0             0
##                
##                 Colpidium ColpidiumVacuoles Cosmarium Cryptomonas Debris
##   20220406.1%           0                 0        93        2162      0
##   20220201.18%          0                 0       176        2987      0
##   20220201.30%          0                 0      1103           0      0
##   20220316.30%          0                 0       328        3334      0
##   20220406.30%          0                 0       429        2901      0
##   Late 2020.30%       101               264       341         781    339
##   20220316.6%           0                 0       164        2239      0
##                
##                 Desmodesmus Dexiostoma DigestedAlgae DividingChlamydomonas
##   20220406.1%           874          0             0                     0
##   20220201.18%         1113          0             0                     0
##   20220201.30%         5122          0             0                     0
##   20220316.30%         4068          0             0                     0
##   20220406.30%         3655          0             0                     0
##   Late 2020.30%        1051        160           154                   398
##   20220316.6%          2573          0             0                     0
##                
##                 Loxocephallus Monoraphidium OtherCiliate Small_unidentified
##   20220406.1%               0           578            0                  0
##   20220201.18%              0          1916            0                  0
##   20220201.30%              0           746            0                  0
##   20220316.30%              0          1062            0                  0
##   20220406.30%              0          1026            0                  0
##   Late 2020.30%           224          1489          164               4642
##   20220316.6%               0           855            0                  0
##                
##                 Staurastrum1 Staurastrum2 Tetrahymena
##   20220406.1%            212           91           0
##   20220201.18%           111          124           0
##   20220201.30%           589          126           0
##   20220316.30%           244           52           0
##   20220406.30%           604           89           0
##   Late 2020.30%          510           75         108
##   20220316.6%            225           94           0
print("Sum of individuals per species")
## [1] "Sum of individuals per species"
colsums <- colSums(tab)
colsums
##            airbubbles         Chlamydomonas   ChlamydomonasClumps 
##                   112                 27425                   359 
##         Coleps_irchel             Colpidium     ColpidiumVacuoles 
##                   475                   101                   264 
##             Cosmarium           Cryptomonas                Debris 
##                  2634                 14404                   339 
##           Desmodesmus            Dexiostoma         DigestedAlgae 
##                 18456                   160                   154 
## DividingChlamydomonas         Loxocephallus         Monoraphidium 
##                   398                   224                  7672 
##          OtherCiliate    Small_unidentified          Staurastrum1 
##                   164                  4642                  2495 
##          Staurastrum2           Tetrahymena 
##                   651                   108

Clearly, the amount of individuals per class varies a lot… I think the most limiting factor is Staurastrum2, i.e. an algae we are interested in but of which only have rather few individuals. For now at least I try to take 2 * Staurastrum2 as the number of individuals per species to be included (if a species/class has less than that all individuals are included).

Within a class I use that number so that roughly equally many individuals across time steps and light settings are included. As a last thing I divide the data into a training dataset and in a test dataset.

n <- colsums["Staurastrum2"]*2
print(paste0("Max number of individuals used per species (if there are fewer for a species, then all are used): n=",n))
## [1] "Max number of individuals used per species (if there are fewer for a species, then all are used): n=1302"
num_ind_per_class <- dd %>% group_by(species) %>% 
  summarize(num_class = length(unique(data.light.species)),
            num_ind_per_class = floor(n/num_class)) %>%
  select(species,num_ind_per_class)

num_ind_per_class_vec <- c(num_ind_per_class$num_ind_per_class)
names(num_ind_per_class_vec) <- num_ind_per_class$species

set.seed(53)

split_up <- split(dd, f = dd$data.light.species)
split_up <- lapply(split_up, function(x) {
  species <- unique(x$species)
  x %>% sample_n(ifelse(nrow(x) < num_ind_per_class_vec[species], nrow(x), num_ind_per_class_vec[species]))})
trainingdata <- do.call("rbind", split_up)

print("Subsampled: number of individuals per species per date and light setting")
## [1] "Subsampled: number of individuals per species per date and light setting"
tab2 <- table(trainingdata$data.light, trainingdata$species)
tab2
##                
##                 airbubbles Chlamydomonas ChlamydomonasClumps Coleps_irchel
##   20220406.1%            0           186                   0             0
##   20220201.18%           0           186                   0             0
##   20220201.30%           0           186                   0             0
##   20220316.30%           0           186                   0             0
##   20220406.30%           0           186                   0             0
##   Late 2020.30%        112           186                 359           475
##   20220316.6%            0           186                   0             0
##                
##                 Colpidium ColpidiumVacuoles Cosmarium Cryptomonas Debris
##   20220406.1%           0                 0        93         217      0
##   20220201.18%          0                 0       176         217      0
##   20220201.30%          0                 0       186           0      0
##   20220316.30%          0                 0       186         217      0
##   20220406.30%          0                 0       186         217      0
##   Late 2020.30%       101               264       186         217    339
##   20220316.6%           0                 0       164         217      0
##                
##                 Desmodesmus Dexiostoma DigestedAlgae DividingChlamydomonas
##   20220406.1%           186          0             0                     0
##   20220201.18%          186          0             0                     0
##   20220201.30%          186          0             0                     0
##   20220316.30%          186          0             0                     0
##   20220406.30%          186          0             0                     0
##   Late 2020.30%         186        160           154                   398
##   20220316.6%           186          0             0                     0
##                
##                 Loxocephallus Monoraphidium OtherCiliate Small_unidentified
##   20220406.1%               0           186            0                  0
##   20220201.18%              0           186            0                  0
##   20220201.30%              0           186            0                  0
##   20220316.30%              0           186            0                  0
##   20220406.30%              0           186            0                  0
##   Late 2020.30%           224           186          164               1302
##   20220316.6%               0           186            0                  0
##                
##                 Staurastrum1 Staurastrum2 Tetrahymena
##   20220406.1%            186           91           0
##   20220201.18%           111          124           0
##   20220201.30%           186          126           0
##   20220316.30%           186           52           0
##   20220406.30%           186           89           0
##   Late 2020.30%          186           75         108
##   20220316.6%            186           94           0
print("Subsampled: Sum of individuals per species")
## [1] "Subsampled: Sum of individuals per species"
colsums2 <- colSums(tab2)
colsums2
##            airbubbles         Chlamydomonas   ChlamydomonasClumps 
##                   112                  1302                   359 
##         Coleps_irchel             Colpidium     ColpidiumVacuoles 
##                   475                   101                   264 
##             Cosmarium           Cryptomonas                Debris 
##                  1177                  1302                   339 
##           Desmodesmus            Dexiostoma         DigestedAlgae 
##                  1302                   160                   154 
## DividingChlamydomonas         Loxocephallus         Monoraphidium 
##                   398                   224                  1302 
##          OtherCiliate    Small_unidentified          Staurastrum1 
##                   164                  1302                  1227 
##          Staurastrum2           Tetrahymena 
##                   651                   108
trainingdata <- trainingdata[complete.cases(trainingdata),]

# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
                                 p = 0.75, # percentage of data as training
                                 list = FALSE)

testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]

5 Loading in species compositions

species <- unique(dd_all$species)
species <- species[!is.element(species,c("airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                                         "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))]
compositions <- read_csv("../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition

compositions <- compositions[,species]

compositions_list <- apply(compositions, 1, function(x){
  idx <- which(x==1)
  names(idx)
})

names(compositions_list) <- comp_name

6 Classifiers

6.1 Classifier plotting functions

classifier_plot_svm <- function(table, combination.nr){
  # cm <- classifier$confusion
  cm <- table
  ncol <- ncol(cm)
  cm <- apply(cm, 1, function(x) x/sum(x))
  
  cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
                        Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
                        Classification = as.vector(cm))
  
  plot <- cm_long %>%
    ggplot(aes(Predicted,Truth,fill=Classification))+
    geom_tile() +
    geom_text(aes(label = round(Classification, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
    scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
    theme(axis.text.x = element_text(angle = 90, hjust = 0))+
    theme(legend.text = element_text(size=14),
          axis.title = element_text(size=14),
          title = element_text(size=16),
          axis.text = element_text(size=13))+
    labs(title=paste("PPV of",combination.nr),fill="PPV")
  
  return(plot)
}


classifier_assessment_plot <- function(cf, combination.nr){

  cf.df <- cf$byClass[,1:4] %>%
    as.data.frame() %>%
    mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
    rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
           "Sens" = "Sensitivity", "Spec" = "Specificity") %>%
    pivot_longer(cols = 1:4) %>%
    mutate(col = ifelse(value>=0.9,"1",
                        ifelse(value<0.9 & value>=0.8,"2",
                               ifelse(value<0.8 & value>=0.7,"3","4"))))

  plot <- cf.df %>%
    ggplot(aes(name,Species,fill=col))+
    geom_tile() +
    geom_text(aes(label = round(value, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
    scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
    theme(legend.text = element_text(size=14),
          title = element_text(size = 16),
          axis.title = element_blank(),
          axis.text = element_text(size=13),
          legend.position = "none")+
    labs(title=combination.nr, fill="")
  
  return(plot)
}

6.2 Training of classifiers

formula <- factor(species) ~ Area_ABD + Area_Filled + Aspect_Ratio +
      Average_Blue + Average_Green + Average_Red + Circle_Fit +
      Circularity + Compactness +
      Convexity + Diameter_ABD + Diameter_ESD + Edge_Gradient +
      Elongation + Feret_Angle_Max + Feret_Angle_Min + Fiber_Curl +
      Fiber_Straightness + Geodesic_Aspect_Ratio + Intensity + Length +
      Perimeter + Ratio_Blue_Green + Ratio_Red_Blue + Ratio_Red_Green +
      Roughness + Sigma_Intensity + Sum_Intensity + Symmetry + Transparency +
      Width

set.seed(8765)
classifiers_18c <- list()
classifiers_18c_plots <- list()
classifiers_18c_assess_plots <- list()


for(i in 1:length(compositions_list)){
  
  if("Colpidium" %in% compositions_list[[i]]) {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                            "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
    sub_testdata <- testdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                            "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
  } else {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
                            "DigestedAlgae","DividingChlamydomonas"))
    sub_testdata <- testdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
                            "DigestedAlgae","DividingChlamydomonas"))
  }

  sub_trainingdata$species <- factor(sub_trainingdata$species)
  sub_testdata$species <- factor(sub_testdata$species)
  
  # split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
  # subsamples <- lapply(split_up, function(x) {
  #   x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
  # sub_trainingdata <- do.call("rbind", subsamples)
  
  # # A stratified random split of the data
  # idx_train <- createDataPartition(sub_trainingdata$species,
  #                                  p = 0.7, # percentage of data as training
  #                                  list = FALSE)
  # 
  # sub_testdata <- sub_trainingdata[-idx_train,]
  # sub_trainingdata <- sub_trainingdata[idx_train,]
  
  obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))
  
  classifiers_18c[[i]] <- obj$best.model
  
  cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                       sub_testdata$species)
  
  classifiers_18c_plots[[i]] <- classifier_plot_svm(table = cf$table,
                                                        combination.nr = names(compositions_list)[[i]])
  
  classifiers_18c_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}

names(classifiers_18c_plots) <- names(classifiers_18c) <-
  names(classifiers_18c_assess_plots) <- comp_name

6.3 Assessing of classifiers

There are mainly 4 measures:

  • Sensitivity: the probability that an individuals is classified as species X given that the it is of species X: P(Classified as species X | is of species X)

  • Specificity: the probability that an individuals is not classified as species X given that it is not of species X: P(Not classified as species X | is not of species X)

  • Positive Predictive Value PPV: the probability that an individual is of species X given that it has been classified as species X: P(is of species X | is classified as species X)

  • Negative Predictive Value NPV: the probability that an individuals is not of species X given that it has not been classified as species X: P(is not of species X | has not been classified as species X)

PPV is the most important for us!

library(patchwork)
for(i in 1:length(classifiers_18c_plots)){
  print(classifiers_18c_plots[[i]] + classifiers_18c_assess_plots[[i]] + 
    plot_layout(widths = c(7,3)))
}

7 Save classifiers

saveRDS(classifiers_18c, "svm_flowcam_classifiers_18c_16x_20220316_MergedData.rds")
# cl <- readRDS("classifiers_18c_25x.rds")
# labels <- dd_all %>% 
#   group_by(species) %>%  
#   summarise(xPos = max(Area_Filled),
#             yPos = max((density(Area_Filled))$y))
# 
# library(directlabels)
# direct.label(qplot(Area_Filled,data=dd_all,colour=species,geom="density",log="x"))